# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold,cross_val_score,KFold
from imblearn.over_sampling import SMOTE,ADASYN
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn import metrics
import seaborn as sns
sns.set(color_codes = True)
import warnings
warnings.filterwarnings('ignore')
# Importing 'signal-data.csv' as a DataFrame
df1 = pd.read_csv("C:/Users/ruhee/OneDrive/Desktop/AIML/Featurization, Model Selection and Tuning/signal-data.csv")
# Creating a copy to keep the original data intact
df = df1.copy()
df
| Time | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 100.0 | 97.6133 | 0.1242 | 1.5005 | ... | NaN | 0.5005 | 0.0118 | 0.0035 | 2.3630 | NaN | NaN | NaN | NaN | -1 |
| 1 | 2008-07-19 12:32:00 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 100.0 | 102.3433 | 0.1247 | 1.4966 | ... | 208.2045 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.0096 | 0.0201 | 0.0060 | 208.2045 | -1 |
| 2 | 2008-07-19 13:17:00 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 100.0 | 95.4878 | 0.1241 | 1.4436 | ... | 82.8602 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.0584 | 0.0484 | 0.0148 | 82.8602 | 1 |
| 3 | 2008-07-19 14:43:00 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 100.0 | 104.2367 | 0.1217 | 1.4882 | ... | 73.8432 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| 4 | 2008-07-19 15:22:00 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.0 | 100.3967 | 0.1235 | 1.5031 | ... | NaN | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.0202 | 0.0149 | 0.0044 | 73.8432 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | 2008-10-16 15:13:00 | 2899.41 | 2464.36 | 2179.7333 | 3085.3781 | 1.4843 | 100.0 | 82.2467 | 0.1248 | 1.3424 | ... | 203.1720 | 0.4988 | 0.0143 | 0.0039 | 2.8669 | 0.0068 | 0.0138 | 0.0047 | 203.1720 | -1 |
| 1563 | 2008-10-16 20:49:00 | 3052.31 | 2522.55 | 2198.5667 | 1124.6595 | 0.8763 | 100.0 | 98.4689 | 0.1205 | 1.4333 | ... | NaN | 0.4975 | 0.0131 | 0.0036 | 2.6238 | 0.0068 | 0.0138 | 0.0047 | 203.1720 | -1 |
| 1564 | 2008-10-17 05:26:00 | 2978.81 | 2379.78 | 2206.3000 | 1110.4967 | 0.8236 | 100.0 | 99.4122 | 0.1208 | NaN | ... | 43.5231 | 0.4987 | 0.0153 | 0.0041 | 3.0590 | 0.0197 | 0.0086 | 0.0025 | 43.5231 | -1 |
| 1565 | 2008-10-17 06:01:00 | 2894.92 | 2532.01 | 2177.0333 | 1183.7287 | 1.5726 | 100.0 | 98.7978 | 0.1213 | 1.4622 | ... | 93.4941 | 0.5004 | 0.0178 | 0.0038 | 3.5662 | 0.0262 | 0.0245 | 0.0075 | 93.4941 | -1 |
| 1566 | 2008-10-17 06:07:00 | 2944.92 | 2450.76 | 2195.4444 | 2914.1792 | 1.5978 | 100.0 | 85.1011 | 0.1235 | NaN | ... | 137.7844 | 0.4987 | 0.0181 | 0.0040 | 3.6275 | 0.0117 | 0.0162 | 0.0045 | 137.7844 | -1 |
1567 rows × 592 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 592 entries, Time to Pass/Fail dtypes: float64(590), int64(1), object(1) memory usage: 7.1+ MB
# Printing 5 point summary
df.describe()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1561.000000 | 1560.000000 | 1553.000000 | 1553.000000 | 1553.000000 | 1553.0 | 1553.000000 | 1558.000000 | 1565.000000 | 1565.000000 | ... | 618.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1567.000000 |
| mean | 3014.452896 | 2495.850231 | 2200.547318 | 1396.376627 | 4.197013 | 100.0 | 101.112908 | 0.121822 | 1.462862 | -0.000841 | ... | 97.934373 | 0.500096 | 0.015318 | 0.003847 | 3.067826 | 0.021458 | 0.016475 | 0.005283 | 99.670066 | -0.867262 |
| std | 73.621787 | 80.407705 | 29.513152 | 441.691640 | 56.355540 | 0.0 | 6.237214 | 0.008961 | 0.073897 | 0.015116 | ... | 87.520966 | 0.003404 | 0.017180 | 0.003720 | 3.578033 | 0.012358 | 0.008808 | 0.002867 | 93.891919 | 0.498010 |
| min | 2743.240000 | 2158.750000 | 2060.660000 | 0.000000 | 0.681500 | 100.0 | 82.131100 | 0.000000 | 1.191000 | -0.053400 | ... | 0.000000 | 0.477800 | 0.006000 | 0.001700 | 1.197500 | -0.016900 | 0.003200 | 0.001000 | 0.000000 | -1.000000 |
| 25% | 2966.260000 | 2452.247500 | 2181.044400 | 1081.875800 | 1.017700 | 100.0 | 97.920000 | 0.121100 | 1.411200 | -0.010800 | ... | 46.184900 | 0.497900 | 0.011600 | 0.003100 | 2.306500 | 0.013425 | 0.010600 | 0.003300 | 44.368600 | -1.000000 |
| 50% | 3011.490000 | 2499.405000 | 2201.066700 | 1285.214400 | 1.316800 | 100.0 | 101.512200 | 0.122400 | 1.461600 | -0.001300 | ... | 72.288900 | 0.500200 | 0.013800 | 0.003600 | 2.757650 | 0.020500 | 0.014800 | 0.004600 | 71.900500 | -1.000000 |
| 75% | 3056.650000 | 2538.822500 | 2218.055500 | 1591.223500 | 1.525700 | 100.0 | 104.586700 | 0.123800 | 1.516900 | 0.008400 | ... | 116.539150 | 0.502375 | 0.016500 | 0.004100 | 3.295175 | 0.027600 | 0.020300 | 0.006400 | 114.749700 | -1.000000 |
| max | 3356.350000 | 2846.440000 | 2315.266700 | 3715.041700 | 1114.536600 | 100.0 | 129.252200 | 0.128600 | 1.656400 | 0.074900 | ... | 737.304800 | 0.509800 | 0.476600 | 0.104500 | 99.303200 | 0.102800 | 0.079900 | 0.028600 | 737.304800 | 1.000000 |
8 rows × 591 columns
There are features present in the data whose standard deviation is 0 which implies that all values in these features are the same.
The mean and standard deviation across features vary. This implies that the distribution is not uniform for all features.
The maximum value of many features are extreme which implies that there are many outliers present in the data.
The count of most of the features is not 1567 which implies that there are missing values.
# Writing a for loop which will remove all the features with 20%+ Null values
for col in df.columns:
null_percentage = df[col].isnull().sum() / len(df)
if null_percentage > 0.20:
df = df.drop(col, axis = 1)
# Imputing remaining null values with mean of the feature
df.fillna(df.mean(), inplace=True)
# Identifying the features which are having same value for all the rows
nunique = df.nunique()
# For features having same value for all rows, there will only be 1 unique value for each row
columns_to_drop = nunique[nunique == 1].index
# Dropping the features which are having same value for all the rows
df = df.drop(columns_to_drop, axis = 1)
df.shape
(1567, 444)
df.dtypes.value_counts()
float64 442 object 1 int64 1 dtype: int64
# Deleting 'Time' feature since it is irrelevant
df = df.drop(columns = ['Time'], axis = 1)
df
| 0 | 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | ... | 577 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3030.93 | 2564.00 | 2187.7333 | 1411.1265 | 1.3602 | 97.6133 | 0.1242 | 1.500500 | 0.016200 | -0.003400 | ... | 14.9509 | 0.5005 | 0.0118 | 0.0035 | 2.3630 | 0.021458 | 0.016475 | 0.005283 | 99.670066 | -1 |
| 1 | 3095.78 | 2465.14 | 2230.4222 | 1463.6606 | 0.8294 | 102.3433 | 0.1247 | 1.496600 | -0.000500 | -0.014800 | ... | 10.9003 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.009600 | 0.020100 | 0.006000 | 208.204500 | -1 |
| 2 | 2932.61 | 2559.94 | 2186.4111 | 1698.0172 | 1.5102 | 95.4878 | 0.1241 | 1.443600 | 0.004100 | 0.001300 | ... | 9.2721 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.058400 | 0.048400 | 0.014800 | 82.860200 | 1 |
| 3 | 2988.72 | 2479.90 | 2199.0333 | 909.7926 | 1.3204 | 104.2367 | 0.1217 | 1.488200 | -0.012400 | -0.003300 | ... | 8.5831 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.020200 | 0.014900 | 0.004400 | 73.843200 | -1 |
| 4 | 3032.24 | 2502.87 | 2233.3667 | 1326.5200 | 1.5334 | 100.3967 | 0.1235 | 1.503100 | -0.003100 | -0.007200 | ... | 10.9698 | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.020200 | 0.014900 | 0.004400 | 73.843200 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | 2899.41 | 2464.36 | 2179.7333 | 3085.3781 | 1.4843 | 82.2467 | 0.1248 | 1.342400 | -0.004500 | -0.005700 | ... | 11.7256 | 0.4988 | 0.0143 | 0.0039 | 2.8669 | 0.006800 | 0.013800 | 0.004700 | 203.172000 | -1 |
| 1563 | 3052.31 | 2522.55 | 2198.5667 | 1124.6595 | 0.8763 | 98.4689 | 0.1205 | 1.433300 | -0.006100 | -0.009300 | ... | 17.8379 | 0.4975 | 0.0131 | 0.0036 | 2.6238 | 0.006800 | 0.013800 | 0.004700 | 203.172000 | -1 |
| 1564 | 2978.81 | 2379.78 | 2206.3000 | 1110.4967 | 0.8236 | 99.4122 | 0.1208 | 1.462862 | -0.000841 | 0.000146 | ... | 17.7267 | 0.4987 | 0.0153 | 0.0041 | 3.0590 | 0.019700 | 0.008600 | 0.002500 | 43.523100 | -1 |
| 1565 | 2894.92 | 2532.01 | 2177.0333 | 1183.7287 | 1.5726 | 98.7978 | 0.1213 | 1.462200 | -0.007200 | 0.003200 | ... | 19.2104 | 0.5004 | 0.0178 | 0.0038 | 3.5662 | 0.026200 | 0.024500 | 0.007500 | 93.494100 | -1 |
| 1566 | 2944.92 | 2450.76 | 2195.4444 | 2914.1792 | 1.5978 | 85.1011 | 0.1235 | 1.462862 | -0.000841 | 0.000146 | ... | 22.9183 | 0.4987 | 0.0181 | 0.0040 | 3.6275 | 0.011700 | 0.016200 | 0.004500 | 137.784400 | -1 |
1567 rows × 443 columns
# Calculating variance inflation factor to check multicollinearity
def calc_vif(df):
vif = pd.DataFrame()
vif["variables"] = df.columns
vif["VIF"] = [variance_inflation_factor(df.values,i) for i in range(df.shape[1])]
high_vif = vif[vif["VIF"] > 5].sort_values("VIF",ascending = False)
return(high_vif)
There are no features with VIF > 5.
# Dropping features having correlation > 0.75
# Creating correlation matrix
corr_matrix = df.corr().abs()
# Selecting upper triangle of correlation matrix
upper_traingle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool))
# Finding features with correlation > 0.75
features_to_drop = [column for column in upper_traingle.columns if any(upper_traingle[column] > 0.75)]
# Dropping the features with correlation > 0.75
df.drop(features_to_drop, axis=1, inplace=True)
df.shape
(1567, 213)
Features having correlation > 0.75 were dropped to reduce collinearity.
df['Pass/Fail'].value_counts()
-1 1463 1 104 Name: Pass/Fail, dtype: int64
Target variable is highly imbalanced which has to be dealt later.
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1567.0 | 3014.452896 | 73.480613 | 2743.2400 | 2966.66500 | 3011.8400 | 3056.5400 | 3356.3500 |
| 1 | 1567.0 | 2495.850231 | 80.227793 | 2158.7500 | 2452.88500 | 2498.9100 | 2538.7450 | 2846.4400 |
| 2 | 1567.0 | 2200.547318 | 29.380932 | 2060.6600 | 2181.09995 | 2200.9556 | 2218.0555 | 2315.2667 |
| 3 | 1567.0 | 1396.376627 | 439.712852 | 0.0000 | 1083.88580 | 1287.3538 | 1590.1699 | 3715.0417 |
| 4 | 1567.0 | 4.197013 | 56.103066 | 0.6815 | 1.01770 | 1.3171 | 1.5296 | 1114.5366 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 583 | 1567.0 | 0.015318 | 0.017174 | 0.0060 | 0.01160 | 0.0138 | 0.0165 | 0.4766 |
| 586 | 1567.0 | 0.021458 | 0.012354 | -0.0169 | 0.01345 | 0.0205 | 0.0276 | 0.1028 |
| 587 | 1567.0 | 0.016475 | 0.008805 | 0.0032 | 0.01060 | 0.0148 | 0.0203 | 0.0799 |
| 589 | 1567.0 | 99.670066 | 93.861936 | 0.0000 | 44.36860 | 72.0230 | 114.7497 | 737.3048 |
| Pass/Fail | 1567.0 | -0.867262 | 0.498010 | -1.0000 | -1.00000 | -1.0000 | -1.0000 | 1.0000 |
213 rows × 8 columns
# Separating the target variable
X = df.drop("Pass/Fail", 1)
y = df["Pass/Fail"]
col = list(X.columns)
#Boxplot to check for outliers
X.plot(kind = 'box', subplots = True, layout = (52,5), fontsize = 10, figsize = (15,150));
Almost all features have outliers.
# Finding the outliers and replacing with median
for i in X.columns:
Q1 = X[i].quantile(0.25)
Q3 = X[i].quantile(0.75)
IQR = Q3 - Q1
low = Q1 - 1.5 * IQR
high = Q3 + 1.5 * IQR
X.loc[(X[i] < low) | (X[i] > high), i] = X[i].median()
# Plotting a boxplot after treating the outliers
X.plot(kind = 'box', subplots = True, layout = (52,5), fontsize = 10, figsize = (15,150));
All previous outliers have been treated by replacing them with the median, the ones that are indicated above as new outliers are the ones that have emerged due to replacing the previous outliers with the median.
# Applying Standard Scaler for PCA
scaler = StandardScaler()
XScaled = scaler.fit_transform(X)
XScaled = pd.DataFrame(XScaled, columns = col)
# Applying PCA on the data
pca = PCA()
pca.fit(XScaled)
# Selecting the components that explain 90% of the variance
n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.90) + 1
print('The number of components that explain 90% of the variance is', n_components)
The number of components that explain 90% of the variance is 140
# PCA with reduced number of components
pca = PCA(n_components = n_components, random_state = 1)
pca.fit(XScaled)
# Creating a DataFrame of the PCA features
pca_df = pd.DataFrame(pca.fit_transform(XScaled))
pca_df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.644789 | 1.555988 | -0.035591 | 1.236922 | 1.171604 | 0.324878 | -1.524152 | -1.047264 | -1.276991 | 0.056900 | ... | 0.361503 | -0.350125 | -0.389705 | -1.095441 | 0.445512 | -0.339276 | 0.735575 | -0.513619 | 0.319935 | -0.574856 |
| 1 | 1.630083 | 1.438580 | 1.354512 | 0.279991 | -1.262334 | -1.331637 | 0.551455 | 1.578536 | 0.351269 | 0.362444 | ... | 0.961946 | -0.009490 | 0.343129 | 0.836309 | -1.277822 | 0.238706 | 0.721098 | -0.822355 | 0.994382 | 0.686088 |
| 2 | 0.591089 | -0.278087 | 1.674860 | 1.491505 | -1.501514 | 0.561300 | -0.382948 | -0.699674 | -2.133496 | -0.882795 | ... | -0.023015 | -0.239584 | 1.330192 | -0.635392 | -1.176063 | 0.136263 | 0.680117 | 0.708619 | 0.965555 | 0.328532 |
| 3 | 1.622691 | -0.634193 | 1.932515 | 2.074932 | -0.586182 | -1.488348 | -0.252609 | 1.054103 | 0.869499 | 0.582168 | ... | 0.805165 | -0.219751 | -0.965765 | -0.506786 | -0.352051 | 0.469337 | 0.505194 | -1.226161 | -0.532279 | 0.234126 |
| 4 | 3.327173 | 0.725975 | 2.176364 | 0.978885 | -1.287525 | -0.582954 | -2.396269 | 1.257364 | 1.396617 | -0.161709 | ... | -0.087908 | 0.823007 | 0.349860 | 0.025129 | 0.620550 | 0.208983 | -1.139646 | -0.103639 | 0.643188 | 0.909207 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | -3.453129 | -0.382925 | 1.545624 | 1.571793 | 0.733279 | -0.738340 | 6.201610 | 1.254702 | -1.000759 | 2.926702 | ... | 0.654111 | 0.829876 | -0.800959 | -1.159662 | -0.303341 | -0.656271 | 0.338915 | -1.203450 | 1.132289 | -0.365437 |
| 1563 | -1.123529 | -4.706733 | 0.369503 | -2.055760 | 2.008787 | 1.682830 | -2.839609 | 0.780663 | -0.038896 | 0.732798 | ... | -0.472787 | -0.707296 | 0.359716 | -0.429094 | 0.192216 | 0.655569 | -0.175979 | -0.586564 | -0.004743 | -0.929638 |
| 1564 | -3.216324 | 1.560841 | -1.157352 | 0.652980 | -0.337359 | -0.580814 | -1.101839 | 1.325213 | -1.078027 | 2.053913 | ... | 0.402379 | -0.274064 | 0.252251 | -1.296790 | -0.155815 | -0.221857 | 0.874958 | -0.170856 | 0.014300 | -0.224303 |
| 1565 | -4.179351 | 0.001799 | 3.031601 | 0.680018 | 1.415056 | -1.091310 | 4.633881 | -0.725715 | 1.660152 | 1.751513 | ... | -1.541391 | 0.115685 | -0.515081 | -0.246566 | -0.428123 | 0.287328 | -0.391627 | -0.296874 | 0.134970 | 1.197429 |
| 1566 | -3.518494 | -3.407988 | 2.524325 | -0.068791 | -0.408080 | 0.160871 | 1.943270 | -0.154048 | 1.560751 | 0.921114 | ... | -0.501392 | 0.480619 | 1.008398 | -0.119960 | -0.014804 | 1.143502 | 0.920672 | -1.226346 | 0.097940 | 0.824647 |
1567 rows × 140 columns
Modifications made:
Features having 20%+ null values were dropped and null values of remaining features were replaced with mean of the features.
All features having the same value for all rows (features with 0 standard deviation) were dropped.
'Time' feature was removed since it is irrelevant to predict the Pass/Fail yield of a particular process.
Multicollinearity was checked using variance inflation factor but there were no features with vif > 5 so no features were eliminated.
Features having correlation higher than 0.75 were dropped since it implies collinearity.
All previous outliers were treated by replacing them with the median, the ones that are indicated as new outliers are the ones that have emerged due to replacing the previous outliers with the median.
Data was scaled using StandardScaler for PCA and number of components that explain 90% of variance of the data were selected.
# Univariate Analysis
# Plotting histogram to check for the frequency of values within a feature
df.hist(bins = 30, figsize = (40, 40))
plt.show()
Many features seem to have a normal distribution.
There are features that have bars that are much tallers than the rest indicating extreme values.
# Bivariate Analysis
sns.jointplot(data = df, x = "15", y = "51", hue = "Pass/Fail", kind = "kde");
sns.jointplot(data = df, x = "200", y = "75", hue = "Pass/Fail", kind = "kde");
sns.scatterplot(data = df, x = "175", y = "6", hue = "Pass/Fail");
sns.scatterplot(data = df, x = "48", y = "12", hue = "Pass/Fail");
All above features have high proportion of failures.
# Multivariate Analysis
plt.figure(figsize = (20,18))
corr = df.corr()
sns.heatmap(corr);
There are areas on the heat map where correlation is high. Most features have low correlation.
# Segregate predictors vs target attributes
# The original df was split into the target variable 'y' and dependent variables 'X'
# The dependent variables were standardized 'XScaled' and PCA was applied
# pca_df consists of the standardized dependent variables that explain 90% variance of the data
# The target variable y is 'Pass/Fail'
# Predictors = pca_df
# Target attributes = y
pca_df
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.644789 | 1.555988 | -0.035591 | 1.236922 | 1.171604 | 0.324878 | -1.524152 | -1.047264 | -1.276991 | 0.056900 | ... | 0.361503 | -0.350125 | -0.389705 | -1.095441 | 0.445512 | -0.339276 | 0.735575 | -0.513619 | 0.319935 | -0.574856 |
| 1 | 1.630083 | 1.438580 | 1.354512 | 0.279991 | -1.262334 | -1.331637 | 0.551455 | 1.578536 | 0.351269 | 0.362444 | ... | 0.961946 | -0.009490 | 0.343129 | 0.836309 | -1.277822 | 0.238706 | 0.721098 | -0.822355 | 0.994382 | 0.686088 |
| 2 | 0.591089 | -0.278087 | 1.674860 | 1.491505 | -1.501514 | 0.561300 | -0.382948 | -0.699674 | -2.133496 | -0.882795 | ... | -0.023015 | -0.239584 | 1.330192 | -0.635392 | -1.176063 | 0.136263 | 0.680117 | 0.708619 | 0.965555 | 0.328532 |
| 3 | 1.622691 | -0.634193 | 1.932515 | 2.074932 | -0.586182 | -1.488348 | -0.252609 | 1.054103 | 0.869499 | 0.582168 | ... | 0.805165 | -0.219751 | -0.965765 | -0.506786 | -0.352051 | 0.469337 | 0.505194 | -1.226161 | -0.532279 | 0.234126 |
| 4 | 3.327173 | 0.725975 | 2.176364 | 0.978885 | -1.287525 | -0.582954 | -2.396269 | 1.257364 | 1.396617 | -0.161709 | ... | -0.087908 | 0.823007 | 0.349860 | 0.025129 | 0.620550 | 0.208983 | -1.139646 | -0.103639 | 0.643188 | 0.909207 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | -3.453129 | -0.382925 | 1.545624 | 1.571793 | 0.733279 | -0.738340 | 6.201610 | 1.254702 | -1.000759 | 2.926702 | ... | 0.654111 | 0.829876 | -0.800959 | -1.159662 | -0.303341 | -0.656271 | 0.338915 | -1.203450 | 1.132289 | -0.365437 |
| 1563 | -1.123529 | -4.706733 | 0.369503 | -2.055760 | 2.008787 | 1.682830 | -2.839609 | 0.780663 | -0.038896 | 0.732798 | ... | -0.472787 | -0.707296 | 0.359716 | -0.429094 | 0.192216 | 0.655569 | -0.175979 | -0.586564 | -0.004743 | -0.929638 |
| 1564 | -3.216324 | 1.560841 | -1.157352 | 0.652980 | -0.337359 | -0.580814 | -1.101839 | 1.325213 | -1.078027 | 2.053913 | ... | 0.402379 | -0.274064 | 0.252251 | -1.296790 | -0.155815 | -0.221857 | 0.874958 | -0.170856 | 0.014300 | -0.224303 |
| 1565 | -4.179351 | 0.001799 | 3.031601 | 0.680018 | 1.415056 | -1.091310 | 4.633881 | -0.725715 | 1.660152 | 1.751513 | ... | -1.541391 | 0.115685 | -0.515081 | -0.246566 | -0.428123 | 0.287328 | -0.391627 | -0.296874 | 0.134970 | 1.197429 |
| 1566 | -3.518494 | -3.407988 | 2.524325 | -0.068791 | -0.408080 | 0.160871 | 1.943270 | -0.154048 | 1.560751 | 0.921114 | ... | -0.501392 | 0.480619 | 1.008398 | -0.119960 | -0.014804 | 1.143502 | 0.920672 | -1.226346 | 0.097940 | 0.824647 |
1567 rows × 140 columns
# We already know the target variable is imbalanced as seen earlier
y.value_counts()
-1 1463 1 104 Name: Pass/Fail, dtype: int64
# Splitting the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(pca_df, y, test_size = 0.2, random_state = 1, stratify = y)
X_train.shape, X_test.shape
((1253, 140), (314, 140))
# Oversampling the minority class using SMOTE to deal with the imbalance
smote = SMOTE()
Xs_train, ys_train = smote.fit_resample(X_train, y_train)
Xs_test, ys_test = smote.fit_resample(X_test, y_test)
ys_train.value_counts()
-1 1170 1 1170 Name: Pass/Fail, dtype: int64
The data was already standarized earlier for performing PCA. Data has been split into train and test test. Target variable has been balanced using SMOTE.
# 5 point summary to see statistical characteristics of original data
df1.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1561.0 | 3014.452896 | 73.621787 | 2743.2400 | 2966.260000 | 3011.4900 | 3056.6500 | 3356.3500 |
| 1 | 1560.0 | 2495.850231 | 80.407705 | 2158.7500 | 2452.247500 | 2499.4050 | 2538.8225 | 2846.4400 |
| 2 | 1553.0 | 2200.547318 | 29.513152 | 2060.6600 | 2181.044400 | 2201.0667 | 2218.0555 | 2315.2667 |
| 3 | 1553.0 | 1396.376627 | 441.691640 | 0.0000 | 1081.875800 | 1285.2144 | 1591.2235 | 3715.0417 |
| 4 | 1553.0 | 4.197013 | 56.355540 | 0.6815 | 1.017700 | 1.3168 | 1.5257 | 1114.5366 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 586 | 1566.0 | 0.021458 | 0.012358 | -0.0169 | 0.013425 | 0.0205 | 0.0276 | 0.1028 |
| 587 | 1566.0 | 0.016475 | 0.008808 | 0.0032 | 0.010600 | 0.0148 | 0.0203 | 0.0799 |
| 588 | 1566.0 | 0.005283 | 0.002867 | 0.0010 | 0.003300 | 0.0046 | 0.0064 | 0.0286 |
| 589 | 1566.0 | 99.670066 | 93.891919 | 0.0000 | 44.368600 | 71.9005 | 114.7497 | 737.3048 |
| Pass/Fail | 1567.0 | -0.867262 | 0.498010 | -1.0000 | -1.000000 | -1.0000 | -1.0000 | 1.0000 |
591 rows × 8 columns
# 5 point summary to see statistical characteristics of train data
Xs_train.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2340.0 | 0.509826 | 2.739167 | -5.974927 | -1.959969 | 0.787019 | 2.781919 | 6.600783 |
| 1 | 2340.0 | 0.011979 | 2.181731 | -5.982382 | -1.691015 | -0.002149 | 1.617126 | 6.776613 |
| 2 | 2340.0 | 0.033354 | 2.016452 | -5.332839 | -1.410070 | -0.099253 | 1.321127 | 6.284217 |
| 3 | 2340.0 | -0.176262 | 1.846292 | -5.337439 | -1.497728 | -0.217744 | 0.965192 | 5.987990 |
| 4 | 2340.0 | -0.410338 | 1.747355 | -5.157398 | -1.727098 | -0.476412 | 0.865765 | 5.859107 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 135 | 2340.0 | -0.021435 | 0.685369 | -2.383451 | -0.500525 | -0.003281 | 0.451413 | 2.943416 |
| 136 | 2340.0 | 0.055353 | 0.640054 | -2.621613 | -0.368400 | 0.067676 | 0.463996 | 2.644035 |
| 137 | 2340.0 | -0.008076 | 0.686434 | -1.978793 | -0.463152 | -0.007342 | 0.391370 | 2.223083 |
| 138 | 2340.0 | -0.071284 | 0.620149 | -2.263248 | -0.460480 | -0.084486 | 0.319499 | 2.377305 |
| 139 | 2340.0 | -0.022930 | 0.681311 | -2.202008 | -0.485036 | -0.065182 | 0.444435 | 2.339492 |
140 rows × 8 columns
# 5 point summary to see statistical characteristics of test data
Xs_test.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 0 | 586.0 | 0.494587 | 2.914676 | -5.906527 | -2.146418 | 0.770109 | 2.695349 | 6.696636 |
| 1 | 586.0 | -0.109703 | 2.279372 | -5.760817 | -1.804292 | -0.160773 | 1.565377 | 5.785719 |
| 2 | 586.0 | 0.518418 | 1.932304 | -5.327931 | -0.884852 | 0.881364 | 1.880651 | 5.566962 |
| 3 | 586.0 | -0.337078 | 1.768363 | -5.250127 | -1.420789 | -0.595462 | 0.722379 | 4.798479 |
| 4 | 586.0 | -0.524400 | 1.787993 | -4.956158 | -1.715628 | -0.667964 | 0.688189 | 5.770369 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 135 | 586.0 | -0.113543 | 0.635953 | -1.960590 | -0.568026 | -0.109050 | 0.249153 | 2.244031 |
| 136 | 586.0 | -0.006345 | 0.653757 | -2.073975 | -0.437462 | -0.048978 | 0.443159 | 2.320330 |
| 137 | 586.0 | -0.010662 | 0.594355 | -1.883108 | -0.454769 | 0.014451 | 0.346173 | 1.667919 |
| 138 | 586.0 | -0.037996 | 0.648089 | -1.969560 | -0.466038 | -0.013769 | 0.372928 | 2.463277 |
| 139 | 586.0 | -0.002268 | 0.609353 | -2.017164 | -0.395440 | 0.004295 | 0.387580 | 1.697605 |
140 rows × 8 columns
The original data has very extrmeme values. As we can see the mean ranges from close to 0 to approximately 3000. The standard deviation also ranges from close to 0 to approximately 73.
The data is scaled in the train and test data due to the extreme values found in the original data. The mean and standard deviation from the train and test data seems to be pretty close to each other with a few exceptions which maybe due to different sizes of train and test data. Q1, median and Q3 values also seem to be close to each other in the train and test data.
This implies that both train and test data are relatively similar.
# Training a model using Gaussian Naive Bayes
nb_model_pca = GaussianNB()
nb_model_pca.fit(Xs_train, ys_train)
GaussianNB()
# Performance of model with training data
# Predicting the response
prediction = nb_model_pca.predict(Xs_train)
# Accuracy
nb_model_pca_training_acc = nb_model_pca.score(Xs_train, ys_train)
print(nb_model_pca.score(Xs_train, ys_train))
0.9098290598290598
# Performance of model with test data
# Predicting the response
prediction = nb_model_pca.predict(Xs_test)
# Accuracy
nb_model_pca_testing_acc = nb_model_pca.score(Xs_test, ys_test)
print(nb_model_pca.score(Xs_test, ys_test))
0.7986348122866894
# Confusion Matrix
print("Confusion Matrix:\n",confusion_matrix(ys_test, prediction))
Confusion Matrix: [[283 10] [108 185]]
# Classification Report
print('Classification Report:\n',classification_report(ys_test, prediction))
Classification Report:
precision recall f1-score support
-1 0.72 0.97 0.83 293
1 0.95 0.63 0.76 293
accuracy 0.80 586
macro avg 0.84 0.80 0.79 586
weighted avg 0.84 0.80 0.79 586
# Visualization of confusion matrix in the form of a heatmap
cm = confusion_matrix(ys_test, prediction)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Actual Classes', fontsize = 15)
plt.ylabel('Predicted Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
# Using k fold cross validation
num_folds = 50
kfold_pca = KFold(n_splits = num_folds)
prediction = cross_val_score(nb_model_pca, Xs_train, ys_train, cv = kfold_pca)
kfold_pca_training_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 90.389% (4.373%)
prediction = cross_val_score(nb_model_pca, Xs_test, ys_test, cv = kfold_pca)
kfold_pca_testing_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 95.545% (4.946%)
# Using LOOCV cross validation
# LOOCV is same as k folds if k value is taken to be maximum number of data points
num_folds = 140
loocv_pca = KFold(n_splits = num_folds)
prediction = cross_val_score(nb_model_pca, Xs_train, ys_train, cv = loocv_pca)
loocv_pca_training_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 90.286% (6.730%)
prediction = cross_val_score(nb_model_pca, Xs_test, ys_test, cv = loocv_pca)
loocv_pca_testing_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 95.643% (9.370%)
# Stratified cross validation
stratified_kfold_pca = StratifiedKFold(n_splits = 10, random_state = 55, shuffle = True)
prediction = cross_val_score(nb_model_pca, Xs_train, ys_train, cv = stratified_kfold_pca)
stratified_kfold_pca_training_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 90.299% (1.863%)
prediction = cross_val_score(nb_model_pca, Xs_test, ys_test, cv = stratified_kfold_pca)
stratified_kfold_pca_testing_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 95.564% (3.067%)
# Grid Search
param_grid_nb = {
'var_smoothing': np.logspace(0,-9, num = 100)
}
# Best Accuracy was seen with LOOCV cross validation
nb_model_grid = GridSearchCV(estimator = nb_model_pca, param_grid = param_grid_nb, verbose = 1, cv = loocv_pca,
n_jobs = -1, scoring = 'accuracy')
nb_model_grid.fit(Xs_train, ys_train)
best_parameters = nb_model_grid.best_params_
print("Best Hyper Parameters:\n", nb_model_grid.best_params_)
Fitting 140 folds for each of 100 candidates, totalling 14000 fits
Best Hyper Parameters:
{'var_smoothing': 0.0023101297000831605}
# Building the tuned nb model
tuned_nb_pca = GaussianNB(**best_parameters)
tuned_nb_pca.fit(Xs_train, ys_train)
# Predicting the response
prediction = tuned_nb_pca.predict(Xs_train)
# Accuracy
tuned_nb_pca_training_acc = tuned_nb_pca.score(Xs_train, ys_train)
print(tuned_nb_pca.score(Xs_train, ys_train))
0.911965811965812
prediction = tuned_nb_pca.predict(Xs_test)
tuned_nb_pca_testing_acc = tuned_nb_pca.score(Xs_test, ys_test)
print(tuned_nb_pca.score(Xs_test, ys_test))
0.8139931740614335
# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(ys_test, prediction))
# Classification Report
print('Classification Report:\n', classification_report(ys_test, prediction))
# Visualization of confusion matrix in the form of a heatmap
cm = confusion_matrix(ys_test, prediction)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Actual Classes', fontsize = 15)
plt.ylabel('Predicted Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
Confusion Matrix:
[[281 12]
[ 97 196]]
Classification Report:
precision recall f1-score support
-1 0.74 0.96 0.84 293
1 0.94 0.67 0.78 293
accuracy 0.81 586
macro avg 0.84 0.81 0.81 586
weighted avg 0.84 0.81 0.81 586
classifiers = [['Naive Bayes :', GaussianNB()],
['KNeighbours :', KNeighborsClassifier()],
['SVM :', SVC()],
['LogisticRegression :', LogisticRegression()],
['DecisionTree :',DecisionTreeClassifier()],
['RandomForest :',RandomForestClassifier()],
['AdaBoosting :',AdaBoostClassifier()],
['GradientBoosting :',GradientBoostingClassifier()]
]
for name,classifier in classifiers:
clf = classifier.fit(Xs_train,ys_train)
y_pred = classifier.predict(Xs_test)
print(f'\n {name} \n')
print(f'Training Score for {name} {clf.score(Xs_train,ys_train) * 100:.2f}' )
print(f'Testing Score for {name} {clf.score(Xs_test,ys_test) * 100:.2f}' )
print(f'Classification report \n {classification_report(ys_test,y_pred)}' )
print(f'Confusion matrix \n {confusion_matrix(ys_test,y_pred)}' )
print(f'ROC AUC : {roc_auc_score(ys_test,y_pred)}' )
Naive Bayes :
Training Score for Naive Bayes : 90.98
Testing Score for Naive Bayes : 79.86
Classification report
precision recall f1-score support
-1 0.72 0.97 0.83 293
1 0.95 0.63 0.76 293
accuracy 0.80 586
macro avg 0.84 0.80 0.79 586
weighted avg 0.84 0.80 0.79 586
Confusion matrix
[[283 10]
[108 185]]
ROC AUC : 0.7986348122866894
KNeighbours :
Training Score for KNeighbours : 60.94
Testing Score for KNeighbours : 57.17
Classification report
precision recall f1-score support
-1 1.00 0.14 0.25 293
1 0.54 1.00 0.70 293
accuracy 0.57 586
macro avg 0.77 0.57 0.48 586
weighted avg 0.77 0.57 0.48 586
Confusion matrix
[[ 42 251]
[ 0 293]]
ROC AUC : 0.5716723549488054
SVM :
Training Score for SVM : 100.00
Testing Score for SVM : 61.43
Classification report
precision recall f1-score support
-1 0.56 1.00 0.72 293
1 1.00 0.23 0.37 293
accuracy 0.61 586
macro avg 0.78 0.61 0.55 586
weighted avg 0.78 0.61 0.55 586
Confusion matrix
[[293 0]
[226 67]]
ROC AUC : 0.6143344709897611
LogisticRegression :
Training Score for LogisticRegression : 87.39
Testing Score for LogisticRegression : 57.34
Classification report
precision recall f1-score support
-1 0.55 0.78 0.65 293
1 0.62 0.37 0.46 293
accuracy 0.57 586
macro avg 0.59 0.57 0.55 586
weighted avg 0.59 0.57 0.55 586
Confusion matrix
[[228 65]
[185 108]]
ROC AUC : 0.5733788395904437
DecisionTree :
Training Score for DecisionTree : 100.00
Testing Score for DecisionTree : 59.73
Classification report
precision recall f1-score support
-1 0.56 0.86 0.68 293
1 0.71 0.33 0.45 293
accuracy 0.60 586
macro avg 0.63 0.60 0.57 586
weighted avg 0.63 0.60 0.57 586
Confusion matrix
[[252 41]
[195 98]]
ROC AUC : 0.5972696245733788
RandomForest :
Training Score for RandomForest : 100.00
Testing Score for RandomForest : 53.41
Classification report
precision recall f1-score support
-1 0.52 1.00 0.68 293
1 1.00 0.07 0.13 293
accuracy 0.53 586
macro avg 0.76 0.53 0.40 586
weighted avg 0.76 0.53 0.40 586
Confusion matrix
[[293 0]
[273 20]]
ROC AUC : 0.5341296928327645
AdaBoosting :
Training Score for AdaBoosting : 94.53
Testing Score for AdaBoosting : 73.89
Classification report
precision recall f1-score support
-1 0.69 0.88 0.77 293
1 0.83 0.60 0.70 293
accuracy 0.74 586
macro avg 0.76 0.74 0.73 586
weighted avg 0.76 0.74 0.73 586
Confusion matrix
[[258 35]
[118 175]]
ROC AUC : 0.7389078498293515
GradientBoosting :
Training Score for GradientBoosting : 99.87
Testing Score for GradientBoosting : 65.87
Classification report
precision recall f1-score support
-1 0.60 0.95 0.73 293
1 0.87 0.37 0.52 293
accuracy 0.66 586
macro avg 0.74 0.66 0.63 586
weighted avg 0.74 0.66 0.63 586
Confusion matrix
[[277 16]
[184 109]]
ROC AUC : 0.658703071672355
# Applying PCA on the data
pca = PCA()
pca.fit(pca_df)
# Selecting the components that explain 90% of the variance
n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.90) + 1
print('The number of components that explain 90% of the variance is', n_components)
The number of components that explain 90% of the variance is 110
# PCA with reduced number of components
pca = PCA(n_components = n_components, random_state = 1)
pca.fit(pca_df)
# Creating a DataFrame of the PCA features
pca_df2 = pd.DataFrame(pca.fit_transform(pca_df))
# Splitting the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(pca_df2, y, test_size = 0.2, random_state = 1, stratify = y)
X_train.shape, X_test.shape
((1253, 110), (314, 110))
# Oversampling the minority class using SMOTE to deal with the imbalance
smote = SMOTE()
Xs_train, ys_train = smote.fit_resample(X_train, y_train)
Xs_test, ys_test = smote.fit_resample(X_test, y_test)
ys_train.value_counts()
-1 1170 1 1170 Name: Pass/Fail, dtype: int64
classifiers = [['Naive Bayes after PCA :', GaussianNB()],
['KNeighbours after PCA :', KNeighborsClassifier()],
['SVM after PCA :', SVC()],
['LogisticRegression after PCA :', LogisticRegression()],
['DecisionTree after PCA :',DecisionTreeClassifier()],
['RandomForest after PCA :',RandomForestClassifier()],
['AdaBoosting after PCA :',AdaBoostClassifier()],
['GradientBoosting after PCA :',GradientBoostingClassifier()]
]
for name,classifier in classifiers:
clf = classifier.fit(Xs_train,ys_train)
y_pred = classifier.predict(Xs_test)
print(f'\n {name} \n')
print(f'Training Score for {name} {clf.score(Xs_train,ys_train) * 100:.2f}' )
print(f'Testing Score for {name} {clf.score(Xs_test,ys_test) * 100:.2f}' )
print(f'Classification report \n {classification_report(ys_test,y_pred)}' )
print(f'Confusion matrix \n {confusion_matrix(ys_test,y_pred)}' )
print(f'ROC AUC : {roc_auc_score(ys_test,y_pred)}' )
Naive Bayes after PCA :
Training Score for Naive Bayes after PCA : 89.74
Testing Score for Naive Bayes after PCA : 80.72
Classification report
precision recall f1-score support
-1 0.74 0.96 0.83 293
1 0.94 0.66 0.77 293
accuracy 0.81 586
macro avg 0.84 0.81 0.80 586
weighted avg 0.84 0.81 0.80 586
Confusion matrix
[[280 13]
[100 193]]
ROC AUC : 0.8071672354948806
KNeighbours after PCA :
Training Score for KNeighbours after PCA : 64.06
Testing Score for KNeighbours after PCA : 59.90
Classification report
precision recall f1-score support
-1 1.00 0.20 0.33 293
1 0.55 1.00 0.71 293
accuracy 0.60 586
macro avg 0.78 0.60 0.52 586
weighted avg 0.78 0.60 0.52 586
Confusion matrix
[[ 58 235]
[ 0 293]]
ROC AUC : 0.598976109215017
SVM after PCA :
Training Score for SVM after PCA : 100.00
Testing Score for SVM after PCA : 58.70
Classification report
precision recall f1-score support
-1 0.55 1.00 0.71 293
1 1.00 0.17 0.30 293
accuracy 0.59 586
macro avg 0.77 0.59 0.50 586
weighted avg 0.77 0.59 0.50 586
Confusion matrix
[[293 0]
[242 51]]
ROC AUC : 0.5870307167235495
LogisticRegression after PCA :
Training Score for LogisticRegression after PCA : 85.64
Testing Score for LogisticRegression after PCA : 53.92
Classification report
precision recall f1-score support
-1 0.53 0.81 0.64 293
1 0.59 0.27 0.37 293
accuracy 0.54 586
macro avg 0.56 0.54 0.50 586
weighted avg 0.56 0.54 0.50 586
Confusion matrix
[[237 56]
[214 79]]
ROC AUC : 0.5392491467576792
DecisionTree after PCA :
Training Score for DecisionTree after PCA : 100.00
Testing Score for DecisionTree after PCA : 56.48
Classification report
precision recall f1-score support
-1 0.54 0.85 0.66 293
1 0.65 0.28 0.39 293
accuracy 0.56 586
macro avg 0.59 0.56 0.53 586
weighted avg 0.59 0.56 0.53 586
Confusion matrix
[[248 45]
[210 83]]
ROC AUC : 0.5648464163822525
RandomForest after PCA :
Training Score for RandomForest after PCA : 100.00
Testing Score for RandomForest after PCA : 53.07
Classification report
precision recall f1-score support
-1 0.52 1.00 0.68 293
1 1.00 0.06 0.12 293
accuracy 0.53 586
macro avg 0.76 0.53 0.40 586
weighted avg 0.76 0.53 0.40 586
Confusion matrix
[[293 0]
[275 18]]
ROC AUC : 0.5307167235494881
AdaBoosting after PCA :
Training Score for AdaBoosting after PCA : 93.03
Testing Score for AdaBoosting after PCA : 75.26
Classification report
precision recall f1-score support
-1 0.69 0.90 0.78 293
1 0.86 0.60 0.71 293
accuracy 0.75 586
macro avg 0.78 0.75 0.75 586
weighted avg 0.78 0.75 0.75 586
Confusion matrix
[[264 29]
[116 177]]
ROC AUC : 0.7525597269624573
GradientBoosting after PCA :
Training Score for GradientBoosting after PCA : 99.70
Testing Score for GradientBoosting after PCA : 64.85
Classification report
precision recall f1-score support
-1 0.59 0.96 0.73 293
1 0.88 0.34 0.49 293
accuracy 0.65 586
macro avg 0.74 0.65 0.61 586
weighted avg 0.74 0.65 0.61 586
Confusion matrix
[[280 13]
[193 100]]
ROC AUC : 0.6484641638225256
# Training a model using Gaussian Naive Bayes
nb_model_pca2 = GaussianNB()
nb_model_pca2.fit(Xs_train, ys_train)
GaussianNB()
# Performance of model with training data
# Predicting the response
prediction = nb_model_pca2.predict(Xs_train)
# Accuracy
nb_model_pca2_training_acc = nb_model_pca2.score(Xs_train, ys_train)
print(nb_model_pca2.score(Xs_train, ys_train))
0.8974358974358975
# Performance of model with test data
# Predicting the response
prediction = nb_model_pca2.predict(Xs_test)
# Accuracy
nb_model_pca2_testing_acc = nb_model_pca2.score(Xs_test, ys_test)
print(nb_model_pca2.score(Xs_test, ys_test))
0.8071672354948806
# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(ys_test, prediction))
# Classification Report
print('Classification Report:\n', classification_report(ys_test, prediction))
# Visualization of confusion matrix in the form of a heatmap
cm = confusion_matrix(ys_test, prediction)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Actual Classes', fontsize = 15)
plt.ylabel('Predicted Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
Confusion Matrix:
[[280 13]
[100 193]]
Classification Report:
precision recall f1-score support
-1 0.74 0.96 0.83 293
1 0.94 0.66 0.77 293
accuracy 0.81 586
macro avg 0.84 0.81 0.80 586
weighted avg 0.84 0.81 0.80 586
# Using k fold cross validation
num_folds = 50
kfold_pca2 = KFold(n_splits = num_folds)
prediction = cross_val_score(nb_model_pca2, Xs_train, ys_train, cv = kfold_pca2)
kfold_pca2_training_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 89.348% (4.702%)
prediction = cross_val_score(nb_model_pca2, Xs_test, ys_test, cv = kfold_pca2)
kfold_pca2_testing_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 96.742% (4.842%)
# Using LOOCV cross validation
# LOOCV is same as k folds if k value is taken to be maximum number of data points
num_folds = 110
loocv_pca2 = KFold(n_splits = num_folds)
prediction = cross_val_score(nb_model_pca2, Xs_train, ys_train, cv = loocv_pca2)
loocv_pca2_training_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 89.299% (6.542%)
prediction = cross_val_score(nb_model_pca2, Xs_test, ys_test, cv = loocv_pca2)
loocv_pca2_testing_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 96.848% (7.147%)
# Stratified cross validation
stratified_kfold_pca2 = StratifiedKFold(n_splits = 10, random_state = 55, shuffle = True)
prediction = cross_val_score(nb_model_pca2, Xs_train, ys_train, cv = stratified_kfold_pca2)
stratified_kfold_pca2_training_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 89.359% (2.115%)
prediction = cross_val_score(nb_model_pca2, Xs_test, ys_test, cv = stratified_kfold_pca2)
stratified_kfold_pca2_testing_acc = prediction.mean()*100.0, prediction.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (prediction.mean()*100.0, prediction.std()*100.0))
Accuracy: 96.932% (2.251%)
# Grid Search
param_grid_nb = {
'var_smoothing': np.logspace(0, -9, num = 100)
}
# Best Accuracy was seen with stratified kfold cross validation
nb_model_grid = GridSearchCV(estimator = nb_model_pca2, param_grid = param_grid_nb, verbose = 1, cv = stratified_kfold_pca2,
n_jobs = -1, scoring = 'accuracy')
nb_model_grid.fit(Xs_train, ys_train)
best_parameters = nb_model_grid.best_params_
print("Best Hyper Parameters:\n", nb_model_grid.best_params_)
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best Hyper Parameters:
{'var_smoothing': 0.006579332246575682}
# Building the tuned nb model
tuned_nb_pca2 = GaussianNB(**best_parameters)
tuned_nb_pca2.fit(Xs_train, ys_train)
# Predicting the response
prediction = tuned_nb_pca2.predict(Xs_train)
# Accuracy
tuned_nb_pca2_training_acc = tuned_nb_pca2.score(Xs_train, ys_train)
print(tuned_nb_pca2.score(Xs_train, ys_train))
0.9055555555555556
prediction = tuned_nb_pca2.predict(Xs_test)
tuned_nb_pca2_testing_acc = tuned_nb_pca2.score(Xs_test, ys_test)
print(f'Testing Accuracy : {tuned_nb_pca2.score(Xs_test, ys_test) * 100:.2f}' )
Testing Accuracy : 82.76
# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(ys_test, prediction))
# Classification Report
print('Classification Report:\n', classification_report(ys_test, prediction))
# Visualization of confusion matrix in the form of a heatmap
cm = confusion_matrix(ys_test, prediction)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Actual Classes', fontsize = 15)
plt.ylabel('Predicted Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
Confusion Matrix:
[[273 20]
[ 81 212]]
Classification Report:
precision recall f1-score support
-1 0.77 0.93 0.84 293
1 0.91 0.72 0.81 293
accuracy 0.83 586
macro avg 0.84 0.83 0.83 586
weighted avg 0.84 0.83 0.83 586
print(nb_model_pca_training_acc)
print(nb_model_pca_testing_acc)
print(kfold_pca_training_acc)
print(kfold_pca_testing_acc)
print(loocv_pca_training_acc)
print(loocv_pca_testing_acc)
print(stratified_kfold_pca_training_acc)
print(stratified_kfold_pca_testing_acc)
print(tuned_nb_pca_training_acc)
print(tuned_nb_pca_testing_acc)
print(nb_model_pca2_training_acc)
print(nb_model_pca2_testing_acc)
print(kfold_pca2_training_acc)
print(kfold_pca2_testing_acc)
print(loocv_pca2_training_acc)
print(loocv_pca2_testing_acc)
print(stratified_kfold_pca2_training_acc)
print(stratified_kfold_pca2_testing_acc)
print(tuned_nb_pca2_training_acc)
print(tuned_nb_pca2_testing_acc)
0.9098290598290598 0.7986348122866894 (90.38852913968547, 4.372692486716284) (95.54545454545453, 4.946082105161713) (90.28623949579833, 6.730346934233625) (95.64285714285715, 9.370219869787649) (90.2991452991453, 1.8632674663753548) (95.56399766218587, 3.067401251815317) 0.911965811965812 0.8139931740614335 0.8974358974358975 0.8071672354948806 (89.34782608695652, 4.701621328355814) (96.74242424242425, 4.841970054931757) (89.29948839039747, 6.542288707710739) (96.84848484848486, 7.146891160488693) (89.35897435897436, 2.1148449626766883) (96.93161893629456, 2.251360234735413) 0.9055555555555556 0.8276450511945392
The model with the best accuracy is the stratified kfold Naive Bayes model done after doing PCA for the second time. Therefore, this is the best model. All other models have lower accuracies.
import pickle
#our best model has been
best_model = nb_model_pca2
best_model.fit(Xs_train, ys_train)
# Save the trained model as a pickle string.
saved_model = pickle.dumps(best_model)
# Save to file in the current working directory
pkl_filename = "pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
pickle.dump(best_model, file)
The original dataframe has 1567 rows and 592 columns. After removing features with 20% + null values, 'Time' feature, features with high correlation and doing PCA, the data was reduced to 140 features. Models were built on this data. PCA was done again features were reduced to 110. Models were built again on this data. The best result was seen with stratified kfold Naive Bayes after doing PCA for the second time.